### Libraries
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(cowplot)
library(gganimate)
library(ggridges)
library(viridis)
library(ggtech) # For the airbnb theme
library(extrafont) # To add the font for Airbnb theme
library(GGally) # Grapihical correlogram
library(knitr) # Nice tables-kables

# Download this fonts and install manually
# https://github.com/ricardo-bion/ggtech/blob/master/Circular%20Air-Medium%203.46.45%20PM.ttf
# https://github.com/ricardo-bion/ggtech/blob/master/Circular%20Air-Bold%203.46.45%20PM.ttf

# Import all fonts, takes a few minutes but removes the annoying warnings
# font_import()
# font_import(pattern = 'Circular', prompt=FALSE)
# loadfonts(device = "win")

# Lets set the theme to this nice one from Airbnb
# Colors: c("#FF5A5F", "#FFB400", "#007A87", "#FFAA91", "#7B0051")
theme_set(theme_airbnb_fancy())
pink = "#FF5A5F"
orange = "#FFB400"
blueGreen = "#007A87"
flesh = "#FFAA91"
purple = "#7B0051"
options(scipen=999) #avoiding e10 notation
### Import data
reviews <- 'data/bgg-13m-reviews.csv' %>%
  read_csv()

games <- 'data/games_detailed_info.csv' %>%
  read_csv()
### Format the data 
# Drop the index
a <- colnames(reviews)
kable(a)
x
X1
user
rating
comment
ID
name
reviews <- reviews %>% 
  select(-X1)


# Rename and drop some variables
a <- colnames(games)
kable(a)
x
X1
Abstract Game Rank
Accessory Rank
Amiga Rank
Arcade Rank
Atari ST Rank
Board Game Rank
Children’s Game Rank
Commodore 64 Rank
Customizable Rank
Family Game Rank
Party Game Rank
RPG Item Rank
Strategy Game Rank
Thematic Rank
Video Game Rank
War Game Rank
alternate
average
averageweight
bayesaverage
boardgameartist
boardgamecategory
boardgamecompilation
boardgamedesigner
boardgameexpansion
boardgamefamily
boardgameimplementation
boardgameintegration
boardgamemechanic
boardgamepublisher
description
id
image
maxplayers
maxplaytime
median
minage
minplayers
minplaytime
numcomments
numweights
owned
playingtime
primary
stddev
suggested_language_dependence
suggested_num_players
suggested_playerage
thumbnail
trading
type
usersrated
wanting
wishing
yearpublished
games <- games %>%
  select(
    "Abstract_Rank" = "Abstract Game Rank", 
    "Rank" = "Board Game Rank",              
    "Childrens_Rank" = "Children's Game Rank",  
    "Family_Rank" = "Family Game Rank",            
    "Party_Rank" = "Party Game Rank",             
    "Strategy_Rank" = "Strategy Game Rank",
    "Thematic_Rank" = "Thematic Rank",               
    "War_Game_Rank" = "War Game Rank",                             
    "Ratings_average" = "average",                      
    "Complexity" = "averageweight",                
    "Geekscore" = "bayesaverage",
    "Category" = "boardgamecategory",
    "Mechanics" = "boardgamemechanic",           
    'Publisher' = "boardgamepublisher",
    'Description' = "description",                 
    "id",        
    "image",                    
    "maxplayers",                  
    "maxplaytime",                 
    "minage",     
    "minplayers",          
    "minplaytime",    
    "numcomments",    
    "numweights",
    "owned",         
    "playingtime",                 
    "name" = "primary",                     
    "Ratings_std_dev" = "stddev",         
    "thumbnail",          
    "trading",                      
    "usersrated",        
    "wanting",
    "wishing", 
    "yearpublished"
  )
# Basic filters
# Lets remove some outliers to make life easier
# Some outliers are ok, eg. for most popular games the "wishing" will be very high
games <- games %>% 
  filter(yearpublished > 1950,
         maxplayers < 20,
         maxplaytime < 600,
         maxplaytime < 600,
         playingtime < 600)

# Creat a binned complexity
games %>%
  ggplot(aes(x = Complexity)) +
  geom_histogram(bins = 60)

# Values are from 1-5, but there is very little after 4.5
games$ComplexityBinned <- games$Complexity %>% 
  cut(
    breaks = c(-Inf, 1.5, 2.5, 3.25, Inf),
    labels = c(1, 2, 3, 4))

a <- table(games$ComplexityBinned)
kable(a)
Var1 Freq
1 5487
2 6586
3 2992
4 1274
# Create binary variables 
# A game can have multiple main categories, so we can't create one column
games$isAbstract <- as.integer( !is.na(games$Abstract_Rank) )
games$isChildrens <- as.integer( !is.na(games$Childrens_Rank) )
games$isFamily <- as.integer( !is.na(games$Family_Rank) )
games$isParty <- as.integer( !is.na(games$Party_Rank) )
games$isStrategy <- as.integer( !is.na(games$Strategy_Rank) )
games$isThematic <- as.integer( !is.na(games$Thematic_Rank) )
games$isWarGame <- as.integer( !is.na(games$War_Game_Rank) )

# Some overlap between categories
a <- games %>%
  group_by(isAbstract,
           isChildrens,
           isFamily,
           isParty,
           isStrategy,
           isThematic,
           isWarGame) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

kable(a)
isAbstract isChildrens isFamily isParty isStrategy isThematic isWarGame n
0 0 0 0 0 0 0 8094
0 0 0 0 0 0 1 2548
0 0 0 0 1 0 0 1247
0 0 1 0 0 0 0 1036
1 0 0 0 0 0 0 719
0 1 0 0 0 0 0 600
0 0 0 0 0 1 0 576
0 0 1 0 1 0 0 311
0 0 0 1 0 0 0 302
0 0 0 0 1 1 0 209
0 0 1 1 0 0 0 117
0 0 0 0 0 1 1 108
0 0 1 0 0 1 0 96
0 1 1 0 0 0 0 88
1 0 1 0 0 0 0 87
0 0 0 0 1 0 1 70
1 0 0 0 1 0 0 39
0 0 0 1 0 1 0 33
1 1 0 0 0 0 0 20
1 0 0 0 0 0 1 10
0 0 0 1 1 0 0 6
0 0 1 0 0 0 1 5
0 1 0 1 0 0 0 5
0 0 0 0 1 1 1 4
0 1 0 0 0 0 1 4
0 1 1 1 0 0 0 1
1 0 0 0 0 1 0 1
1 0 0 0 1 1 0 1
1 0 0 1 0 0 0 1
1 1 0 0 0 0 1 1
# Convert category ranks to quantiles
games$quantileAbstract <- games %>%
  .$Abstract_Rank / max(games$Abstract_Rank, na.rm = TRUE)

games$quantileChildrens <- games %>%
  .$Childrens_Rank / max(games$Childrens_Rank, na.rm = TRUE)

games$quantileFamily <- games %>%
  .$Family_Rank / max(games$Family_Rank, na.rm = TRUE)

games$quantileParty <- games %>%
  .$Party_Rank / max(games$Party_Rank, na.rm = TRUE)

games$quantileStrategy <- games %>%
  .$Strategy_Rank / max(games$Strategy_Rank, na.rm = TRUE)

games$quantileThematic <- games %>%
  .$Thematic_Rank / max(games$Thematic_Rank, na.rm = TRUE)

games$quantileWarGame <- games %>%
  .$War_Game_Rank / max(games$War_Game_Rank, na.rm = TRUE)
# Find the primary category
# How? By the lowest quantile, eg. the most important in this catergory
find_lowest_quantile <- function(row, output) {
  abstract = row["quantileAbstract"]
  family = row["quantileChildrens"]
  child = row["quantileFamily"]
  party = row["quantileParty"]
  strategy = row["quantileStrategy"]
  thematic = row["quantileThematic"]
  war = row["quantileWarGame"]
  other = 1
  
  lowest <- min(
    abstract,
    family,
    child,
    party,
    strategy,
    thematic,
    war,
    other,
    na.rm = TRUE)
  
  case_when(
    lowest == abstract ~ 'Abstract',
    lowest == family ~ 'Family',
    lowest == child ~ "Children",
    lowest == party ~ 'Party',
    lowest == strategy ~ 'Strategy',
    lowest == thematic ~ 'Thematic',
    lowest == war ~ 'War Game',
    lowest == 1 ~ 'Other'
  ) %>%
    return()
}

# Takes a while
games$cat <- games %>%
  apply(1, find_lowest_quantile) %>%
  as.factor()

# A nice distribution, a lot of other 
a <- games$cat %>% table()
kable(a)
. Freq
Abstract 852
Children 1453
Family 718
Other 8098
Party 445
Strategy 1249
Thematic 791
War Game 2733
my_dens <- function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping) +
    geom_density_ridges(aes(y = games$ComplexityBinned, fill = games$ComplexityBinned), scale = 3, rel_min_height = 0.01, alpha = 0.7, color = NA ) + 
    scale_fill_manual(values = c("#FF5A5F", "#FFB400", "#007A87", "#FFAA91", "#7B0051")) 
}

my_dots <- function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping) +
    geom_point(alpha = 0.1) + 
    scale_color_tech(theme="airbnb") 
}

games %>%
  select( "Rank",
          "Ratings_average",
          "yearpublished",
          "usersrated",
          "playingtime",   
          "usersrated") %>%
  ggpairs(
    mapping = aes(color = games$ComplexityBinned, fill = games$ComplexityBinned),
    diag = list(continuous = my_dens),
    lower = list(continuous = my_dots),
    progress = TRUE,
    axisLabels = 'none',
    legend = 1) + 
  theme(text = element_text(size = 10)) +
  labs(title = '')

# 002 - Boardgames per year
games %>%
  ggplot(aes(x = yearpublished)) +
  geom_density(color = orange, fill = orange, alpha = 0.5) +
  labs(title = 'Board games published per year', x = 'Year', y = 'Density')

# 003 - Average ratings vs year (>2009)
plot <- games %>% 
  filter(yearpublished > 2009) %>%
  ggplot(aes(x = Ratings_average)) +
  geom_density(color = orange, fill = orange, alpha = 0.5) +  
  labs(title = 'Ratings of games published in: {floor(frame_time)}', x = 'Ratings', y = 'Density') +
  transition_time(yearpublished) +
  ease_aes('linear') 

plot %>% animate(width = 700, height = 300)

games %>%
  ggplot(aes(x = Ratings_average, y = ComplexityBinned, fill = ComplexityBinned)) +
  geom_density_ridges(scale = 3, rel_min_height = 0.01, alpha = 0.7, color = NA) + 
  scale_fill_manual(values = c("#FF5A5F", "#FFB400", "#007A87", "#FFAA91", "#7B0051")) +
  scale_x_continuous(limits = c(3, 9.3)) +
  labs(title = 'Does the complexity of the game infuence the ratings?',
       x = 'Ratings',
       y = 'Complexity of the game',
       fill = 'Complexity')
Picking joint bandwidth of 0.131

games %>% 
  ggplot(aes(x = Ratings_average, y = Rank)) +
  geom_point(aes(colour = Rank), alpha = 0.2) +
  scale_color_gradient(low = orange, high = purple) +
  labs(title = 'Does the average rating influence the final rank?',
       subtitle = 'Chicken Drumstick distribution',
       x = 'Average rating',
       y = 'Overall rank') +
  theme(legend.position = 'none') +
  scale_y_reverse()

games %>% 
  ggplot(aes(x = log10(usersrated), y = Rank)) +
  geom_point(aes(colour = Rank), alpha = 0.3) + 
  scale_color_gradient(low = orange, high = purple) +
  labs(title = 'Does the number of user ratings influence the final rank?',
       subtitle = 'Jaws distribution',
       x = 'Log 10 of the number of ratings ',
       y = 'Overall rank') +
  theme(legend.position = 'none') +
  scale_y_reverse()

# 007 rank vs category
games$isTop <- ifelse(games$Rank < 300, 1, 0)

games %>%
  ggplot(aes(x = Rank, fill = cat)) +
  geom_density(alpha = 0.7, color = NA) + 
  facet_wrap('cat') + 
  scale_fill_manual(values = c("#FF5A5F", "#FFB400", "#007A87", "#FFAA91", "#7B0051", "#FF5A5F", "#FFB400", "#007A87")) +
  theme(legend.position = 'none') +
  labs(title = 'Does the main category of the game infuence the rank?',
       subtitle = 'All games',
       x = 'Rank (lower is better)',
       y = 'Density')

games %>%
  filter(isTop == 1) %>%
  ggplot(aes(x = Rank, fill = cat)) +
  geom_histogram(alpha = 0.7, color = NA, bins = 30) + 
  facet_wrap('cat') + 
  scale_fill_manual(values = c("#FF5A5F", "#FFB400", "#007A87", "#FFAA91", "#7B0051", "#FF5A5F", "#FFB400", "#007A87")) +
  theme(legend.position = 'none') +
  labs(title = 'Does the main category of the game infuence the rank?',
       subtitle = 'Top 300 games',
       x = 'Rank (lower is better)',
       y = 'Count')

# See the top 1% of abstract games, and their rank
a <- games %>%
  filter(quantileAbstract < 0.01 ) %>%
  {paste0(.$name, ' ' , .$Rank)}
kable(a)
x
Patchwork 64
Azul 37
Santorini 101
Sagrada 125
Onitama 205
YINSH 157
DVONN 386
TZAAR 357
# 008 Wishing vs rank
games %>%
  filter(wishing > 10) %>%
  ggplot(aes(x = log10(wishing), y = Rank)) +
  geom_point(aes(colour = Rank), alpha = 0.3) + 
  geom_smooth(color = purple) + 
  scale_color_gradient(low = orange, high = purple) +
  labs(title = 'Does the number of user wishlisting influence the final rank?',
       subtitle = 'Removed obeservations with < 10 users',
       x = 'Log 10 of the number of users wishlisting',
       y = 'Overall rank') +
  theme(legend.position = 'none') +
  scale_y_reverse()

# 009 - what does not influence 
# % of written comments
games %>%
  filter(numcomments / usersrated < 1) %>% 
  ggplot(aes(x = numcomments / usersrated , y = Rank)) +
  geom_point(aes(colour = Rank), alpha = 0.5) + 
  scale_color_gradient(low = orange, high = purple) +
  labs(title = '% of ratings with written reviews',
       x = 'Number of comments / Number of ratings',
       y = 'Overall rank') +
  theme(legend.position = 'none') +
  scale_y_reverse()

# Std dev 
games %>%
  ggplot(aes(x = Ratings_std_dev, y = Rank))+
  geom_point(aes(colour = Rank), alpha = 0.5) + 
  scale_color_gradient(low = orange, high = purple) +
  labs(title = 'How controversial the game is',
       x = 'Standard deviation of ratings',
       y = 'Overall rank') +
  theme(legend.position = 'none') +
  scale_y_reverse()

games %>%
  filter(Ratings_std_dev > 0.75,
         Ratings_std_dev < 2.5) %>%
  ggplot(aes(x = Ratings_std_dev, y = Rank)) +
  stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) + 
  scale_fill_gradient(high = orange, low = purple)